# Return the credit card number passed in via the document's `params`
# list (R Markdown parameterized report). Yields NULL when no param was
# supplied. NOTE(review): `params` must exist in the calling environment;
# confirm this script is knitted with a `cc_number` parameter.
cc_number <- function() {
  params$cc_number
}
# Replace the accessor with its own result: from here on `cc_number`
# holds the value itself (NULL in this knit, per the echoed output).
cc_number <- cc_number()
cc_number
## NULL
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(skimr)
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Load the cleaned transaction data and fix column types in one pass:
# character columns become factors, the card number and zip code are
# treated as identifiers (factors, not quantities), and the transaction
# timestamp is parsed to POSIXct.
data <- read.csv("clean_data.csv") %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(
    cc_num = as.factor(cc_num),
    zip = as.factor(zip),
    trans_date_trans_time = as_datetime(trans_date_trans_time)
  )
# One row per card: number of transactions made on each card.
data_bucket <- data %>%
  group_by(cc_num) %>%
  summarize(count = n())
library(cluster)
# Average silhouette width for PAM clusterings of the per-card
# transaction counts, k = 2..20; used to choose the number of card
# profiles. vapply (not sapply) guarantees a numeric vector even for
# degenerate input; also fixes the "silhoette" typo.
k_values <- 2:20
silhouette_width <- vapply(
  k_values,
  FUN = function(k) pam(x = data_bucket$count, k = k)$silinfo$avg.width,
  FUN.VALUE = numeric(1)
)
ggplot(data = data.frame(cluster = k_values, silhouette_width),
       aes(x = cluster, y = silhouette_width)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = k_values)
# Cluster the per-card transaction counts into 7 usage profiles.
# Seed fixed so cluster labels are reproducible.
set.seed(617)
km_profile <- kmeans(
  x = data_bucket$count,
  centers = 7,
  iter.max = 10000,
  nstart = 25
)
k_segments_profile <- km_profile$cluster
table(k_segments_profile)
## k_segments_profile
## 1 2 3 4 5 6 7
## 31 57 96 130 247 194 238
# Tag each card's row with its profile id.
data_bucket <- cbind(data_bucket, profile = k_segments_profile)
# One representative card per profile: the card with the MOST
# transactions in that profile. BUG FIX: the original indexed row 1 for
# profile 1, row 2 for profile 2, ..., row 7 for profile 7 ([2,1],
# [3,1], ...) — apparent copy-paste index drift; every profile should
# take its top row. Taking the first row per group after sorting by
# descending count does exactly that, for all 7 profiles at once.
cc_representative <- (
  data_bucket %>%
    arrange(desc(count)) %>%
    group_by(profile) %>%
    slice(1) %>%
    ungroup() %>%
    arrange(profile)
)$cc_num
library(svDialogs)
## Warning: package 'svDialogs' was built under R version 4.1.3
# Interactive prompt, disabled for the knitted report:
#cc_number <- dlgInput("Enter credit card number", Sys.info()["user"])$res
# Card under analysis; uncomment one line below to switch to a profile
# representative instead.
cc_number <- 30270432095985
# cc_number = as.numeric(as.character(cc_representative[1]))
# cc_number = as.numeric(as.character(cc_representative[2]))
# cc_number = as.numeric(as.character(cc_representative[3]))
# cc_number = as.numeric(as.character(cc_representative[4]))
# cc_number = as.numeric(as.character(cc_representative[5]))
# cc_number = as.numeric(as.character(cc_representative[6]))
# cc_number = as.numeric(as.character(cc_representative[7]))
# All transactions for the chosen card. NOTE(review): this compares a
# factor against a numeric literal; confirm precision holds for 16-digit
# card numbers (doubles lose exactness above 2^53).
data_indiv <- filter(data, cc_num == cc_number)
# Elbow method: total within-cluster SS for k = 1..10 k-means fits on
# this card's transaction amounts. vapply (not sapply) guarantees a
# numeric vector; seed is reset per fit for reproducibility.
k_candidates <- 1:10
within_ss <- vapply(k_candidates, FUN.VALUE = numeric(1), FUN = function(k) {
  set.seed(617)
  kmeans(x = data_indiv$amt, centers = k, iter.max = 1000, nstart = 25)$tot.withinss
})
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 219600)
ggplot(data = data.frame(cluster = k_candidates, within_ss),
       aes(x = cluster, y = within_ss)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = k_candidates)
## Ratio plot
# Between-SS / total-SS ratio for k = 1..10 (higher = better-separated
# clusters). vapply replaces sapply for a guaranteed numeric result.
k_candidates <- 1:10
ratio_ss <- vapply(k_candidates, FUN.VALUE = numeric(1), FUN = function(k) {
  set.seed(617)
  fit <- kmeans(x = data_indiv$amt, centers = k, iter.max = 1000, nstart = 25)
  fit$betweenss / fit$totss
})
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 219600)
ggplot(data = data.frame(cluster = k_candidates, ratio_ss),
       aes(x = cluster, y = ratio_ss)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = k_candidates)
## Silhouette
library(cluster)
# Average silhouette width for PAM fits on this card's transaction
# amounts, k = 2..10. vapply replaces sapply; typo "silhoette" fixed
# (the name is only read inside this block).
k_candidates <- 2:10
silhouette_width <- vapply(
  k_candidates,
  FUN = function(k) pam(x = data_indiv$amt, k = k)$silinfo$avg.width,
  FUN.VALUE = numeric(1)
)
ggplot(data = data.frame(cluster = k_candidates, silhouette_width),
       aes(x = cluster, y = silhouette_width)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = k_candidates)
# Ward-linkage hierarchical clustering of this card's transaction
# amounts, cut into 9 segments. The distance object gets a descriptive
# name instead of `d` (which a later block reuses for a scalar).
amt_dist <- dist(x = data_indiv$amt, method = 'euclidean')
clusters <- hclust(d = amt_dist, method = 'ward.D2')
h_segments <- cutree(tree = clusters, k = 9)
table(h_segments)
## h_segments
## 1 2 3 4 5 6 7 8 9
## 1833 1805 587 47 90 2 6 21 1
# k-means on the amounts with the same k (9) as the hierarchical cut,
# for comparison between the two segmentations.
set.seed(617)
km <- kmeans(
  x = data_indiv$amt,
  centers = 9,
  iter.max = 10000,
  nstart = 25
)
k_segments <- km$cluster
table(k_segments)
## k_segments
## 1 2 3 4 5 6 7 8 9
## 687 2149 18 3 1348 1 131 15 40
library(mclust)
## Package 'mclust' version 5.4.9
## Type 'citation("mclust")' for citing this R package in publications.
# Gaussian mixture model on the transaction amounts; Mclust selects the
# number of components automatically (by BIC).
m_clusters = Mclust(data=data_indiv$amt)
# Hard assignment of each transaction to its most likely component.
m_segments = m_clusters$classification
#sort(table(m_segments))
#plot(m_clusters, what = "density", xlim = c(0, 300))
# Uncertainty plot, x-axis clipped to [0, 500] — hence the "values will
# be clipped" warning from rug() below.
plot(m_clusters, what = "uncertainty", xlim = c(0, 500))
## Warning in rug(data, lwd = 1, col = adjustcolor(par("fg"), alpha.f = 0.8)): some
## values will be clipped
# Same clipped window, colored by component assignment.
plot(m_clusters, what = 'classification', xlim = c(0, 500))
# k-modes clustering on two categorical fields of the card's
# transactions. Changes vs original: `select(where(is.factor))` replaces
# the superseded `select_if`, and the variable is `cat_cols` rather than
# `cat` (which shadowed base::cat). dplyr:: is qualified because klaR
# loads MASS, whose select() masks dplyr's.
cat_cols <- data_indiv %>% dplyr::select(where(is.factor)) %>% names()
cat_data <- data_indiv[cat_cols]
# NOTE(review): factor columns are picked by position (3 and 13) —
# fragile if the CSV schema changes; confirm these are the intended fields.
cat_data <- cat_data[, c(3, 13)]
library(klaR)
## Warning: package 'klaR' was built under R version 4.1.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# NOTE(review): kmodes is randomized and no seed is set here, so the
# segment labels can differ between runs.
hasil <- kmodes(cat_data, 7, iter.max = 7, weighted = FALSE, fast = TRUE)
kmode_segments <- hasil$cluster
# Attach all four segment labelings to the card's transactions, then
# keep only the columns needed for evaluation.
# NOTE(review): purely positional selection (3,4,5,17 plus the appended
# segment columns at 21:25) — fragile if clean_data.csv's schema changes;
# confirm the indices against the current column order.
data_clusters = cbind(data_indiv, h_segments, k_segments, m_segments, kmode_segments)
data_clusters = data_clusters[,c(3,4,5,17,21:25)]
# Hierarchical model: predict fraud for every point in one of the 4
# SMALLEST segments (fraud is rare, so tiny clusters are the suspects).
# Replaces the copy-pasted a/b/c/d lookups, which also shadowed base
# `c()` and clobbered the earlier `d` object. Identical logic:
# names(sort(table(x)))[1:4] equals the original attributes(...)$name
# extraction for the four least-populated segment ids.
smallest_h <- as.numeric(names(sort(table(h_segments)))[1:4])
data_clusters$h_is_fraud_pred <- as.numeric(data_clusters$h_segments %in% smallest_h)
# k-means model: same rule — flag the 4 smallest segments as predicted
# fraud. Vectorized `%in%` replaces the repetitive a/b/c/d assignments
# (which shadowed base `c()`); result is the same 0/1 numeric column.
smallest_k <- as.numeric(names(sort(table(k_segments)))[1:4])
data_clusters$k_is_fraud_pred <- as.numeric(data_clusters$k_segments %in% smallest_k)
# Mixture model: flag the 2 smallest components as predicted fraud.
# Same vectorized form as the other models; identical 0/1 output.
smallest_m <- as.numeric(names(sort(table(m_segments)))[1:2])
data_clusters$m_is_fraud_pred <- as.numeric(data_clusters$m_segments %in% smallest_m)
# k-modes model: flag the 2 smallest categorical clusters as predicted
# fraud, mirroring the rule used for the numeric models.
smallest_kmode <- as.numeric(names(sort(table(kmode_segments)))[1:2])
data_clusters$kmode_is_fraud_pred <- as.numeric(data_clusters$kmode_segments %in% smallest_kmode)
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: lattice
# Ensemble rule: predict fraud when EITHER the mixture model OR k-modes
# flags it. NOTE(review): this yields a logical column (TRUE/FALSE),
# unlike the 0/1 numeric columns of the individual models — downstream
# code must convert it before comparing against is_fraud.
data_clusters$ensemble_pred <- (data_clusters$m_is_fraud_pred | data_clusters$kmode_is_fraud_pred)
# Ground truth plus the five prediction columns, converted to factors
# as caret::confusionMatrix expects.
result_matrix <- data_clusters[, c(4, 10:14)]
expected_value <- factor(result_matrix$is_fraud)
h_predicted_value <- factor(result_matrix$h_is_fraud_pred)
k_predicted_value <- factor(result_matrix$k_is_fraud_pred)
m_predicted_value <- factor(result_matrix$m_is_fraud_pred)
kmode_predicted_value <- factor(result_matrix$kmode_is_fraud_pred)
ensemble_predicted_value <- factor(result_matrix$ensemble_pred)
#Creating confusion matrix
# Hierarchical model vs. ground truth. caret treats "0" (legitimate) as
# the positive class, so "Specificity" below is the fraud detection
# rate: 3 of 15 frauds caught (0.20). Accuracy (0.9911) is below the
# No Information Rate (0.9966) — worse than always predicting "0".
h_cm <- confusionMatrix(data=h_predicted_value, reference = expected_value)
h_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4350 12
## 1 27 3
##
## Accuracy : 0.9911
## 95% CI : (0.9879, 0.9937)
## No Information Rate : 0.9966
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : 0.1294
##
## Mcnemar's Test P-Value : 0.02497
##
## Sensitivity : 0.9938
## Specificity : 0.2000
## Pos Pred Value : 0.9972
## Neg Pred Value : 0.1000
## Prevalence : 0.9966
## Detection Rate : 0.9904
## Detection Prevalence : 0.9932
## Balanced Accuracy : 0.5969
##
## 'Positive' Class : 0
##
# k-means model: same fraud recall as the hierarchical cut (3/15,
# specificity 0.20) with slightly more false alarms (34 vs 27).
k_cm <- confusionMatrix(data=k_predicted_value, reference = expected_value)
k_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4343 12
## 1 34 3
##
## Accuracy : 0.9895
## 95% CI : (0.9861, 0.9923)
## No Information Rate : 0.9966
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : 0.1111
##
## Mcnemar's Test P-Value : 0.00196
##
## Sensitivity : 0.99223
## Specificity : 0.20000
## Pos Pred Value : 0.99724
## Neg Pred Value : 0.08108
## Prevalence : 0.99658
## Detection Rate : 0.98884
## Detection Prevalence : 0.99158
## Balanced Accuracy : 0.59612
##
## 'Positive' Class : 0
##
# Mixture model: best fraud recall of the single models (6/15 frauds,
# specificity 0.40) at the cost of 137 false positives.
m_cm <- confusionMatrix(data=m_predicted_value, reference = expected_value)
m_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4240 9
## 1 137 6
##
## Accuracy : 0.9668
## 95% CI : (0.961, 0.9719)
## No Information Rate : 0.9966
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0702
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.96870
## Specificity : 0.40000
## Pos Pred Value : 0.99788
## Neg Pred Value : 0.04196
## Prevalence : 0.99658
## Detection Rate : 0.96539
## Detection Prevalence : 0.96744
## Balanced Accuracy : 0.68435
##
## 'Positive' Class : 0
##
# k-modes model: catches no fraud at all (specificity 0.0, negative
# kappa) while raising 261 false alarms — the two categorical fields
# carry no usable fraud signal on their own.
kmode_cm <- confusionMatrix(data=kmode_predicted_value, reference = expected_value)
kmode_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4116 15
## 1 261 0
##
## Accuracy : 0.9372
## 95% CI : (0.9296, 0.9442)
## No Information Rate : 0.9966
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0065
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9404
## Specificity : 0.0000
## Pos Pred Value : 0.9964
## Neg Pred Value : 0.0000
## Prevalence : 0.9966
## Detection Rate : 0.9372
## Detection Prevalence : 0.9406
## Balanced Accuracy : 0.4702
##
## 'Positive' Class : 0
##
# Ensemble (mixture OR k-modes) vs. ground truth. The ensemble flag is
# a logical column, so convert TRUE/FALSE directly to 1/0 rather than
# shifting factor level codes (the original `as.numeric(factor) - 1`
# trick silently depends on the factor's level ordering).
ensemble_cm <- confusionMatrix(
  data = factor(as.integer(data_clusters$ensemble_pred)),
  reference = expected_value
)
ensemble_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4005 9
## 1 372 6
##
## Accuracy : 0.9133
## 95% CI : (0.9045, 0.9214)
## No Information Rate : 0.9966
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0241
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.91501
## Specificity : 0.40000
## Pos Pred Value : 0.99776
## Neg Pred Value : 0.01587
## Prevalence : 0.99658
## Detection Rate : 0.91189
## Detection Prevalence : 0.91393
## Balanced Accuracy : 0.65751
##
## 'Positive' Class : 0
##
# Benchmark: supervised logistic regression on the same feature subset,
# with a 70/30 stratified split.
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
data_clusters2 <- data_clusters[, 2:5]
set.seed(5205)
split <- sample.split(data_clusters2$is_fraud, SplitRatio = 0.7)
train <- data_clusters2[split, ]
test <- data_clusters2[!split, ]
# Perfect-separation warning below: some predictors fully determine the
# outcome on the training rows.
model <- glm(is_fraud ~ ., data = train, family = 'binomial')
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
pred <- predict(model, newdata = test, type = 'response')
expected_value2 <- factor(test$is_fraud)
# Threshold the predicted probabilities at 0.5.
glm_predicted_value <- factor(as.integer(pred > 0.5))
glm_cm <- confusionMatrix(data = glm_predicted_value, reference = expected_value2)
glm_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1312 5
## 1 1 0
##
## Accuracy : 0.9954
## 95% CI : (0.9901, 0.9983)
## No Information Rate : 0.9962
## P-Value [Acc > NIR] : 0.7625
##
## Kappa : -0.0013
##
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.9992
## Specificity : 0.0000
## Pos Pred Value : 0.9962
## Neg Pred Value : 0.0000
## Prevalence : 0.9962
## Detection Rate : 0.9954
## Detection Prevalence : 0.9992
## Balanced Accuracy : 0.4996
##
## 'Positive' Class : 0
##
# Nationwide map of every actual fraud location in the dataset.
data_map <- filter(data, is_fraud == 1)
library(ggmap)
## Warning: package 'ggmap' was built under R version 4.1.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
# SECURITY: never commit a live Google Maps API key to source — the
# original embedded one here. Supply it via the GOOGLE_MAPS_API_KEY
# environment variable instead, and revoke the leaked key.
register_google(key = Sys.getenv("GOOGLE_MAPS_API_KEY"))
# Continental-US view (center is the geographic center of the US).
map <- get_map(location = c(-95.7129, 37.0902), zoom = 4, scale = 4)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=37.0902,-95.7129&zoom=4&size=640x640&scale=4&maptype=terrain&language=en-EN&key=xxx
ggmap(map) +
  geom_point(data = data_map, aes(x = merch_long, y = merch_lat),
             size = 0.5, alpha = 0.5, color = 'red')
## Warning: Removed 17 rows containing missing values (geom_point).
# Fraud incidents per state, most-affected first.
data_map2 <- data_map %>%
  count(state, name = 'count') %>%
  arrange(desc(count))
data_map2
## # A tibble: 49 x 2
## state count
## <fct> <int>
## 1 NY 568
## 2 TX 555
## 3 PA 543
## 4 OH 348
## 5 IL 319
## 6 CA 289
## 7 AL 263
## 8 MO 262
## 9 MN 253
## 10 VA 240
## # ... with 39 more rows
# Spatial view for the analyzed card: all its transactions (green)
# overlaid with the mixture model's PREDICTED frauds, then with ACTUAL
# frauds (red), for visual comparison.
merch_lat <- data_indiv$merch_lat
merch_long <- data_indiv$merch_long
data_spatial <- cbind(data_clusters, merch_lat, merch_long)
data_spatial_fraud_pred <- filter(data_spatial, m_is_fraud_pred == 1)
data_spatial_fraud <- filter(data_spatial, is_fraud == 1)
library(ggmap)
# SECURITY: API key read from the environment (GOOGLE_MAPS_API_KEY) —
# the original hardcoded a live key here; it must be revoked.
register_google(key = Sys.getenv("GOOGLE_MAPS_API_KEY"))
# Center the map on the card's median merchant location.
map <- get_map(
  location = c(median(data_spatial$merch_long), median(data_spatial$merch_lat)),
  zoom = 8, scale = 4
)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.669803,-91.023106&zoom=8&size=640x640&scale=4&maptype=terrain&language=en-EN&key=xxx
ggmap(map) +
  geom_point(data = data_spatial, aes(x = merch_long, y = merch_lat),
             size = 1, alpha = 0.2, color = 'seagreen') +
  geom_point(data = data_spatial_fraud_pred, aes(x = merch_long, y = merch_lat),
             size = 1, alpha = 1, color = 'red')
ggmap(map) +
  geom_point(data = data_spatial, aes(x = merch_long, y = merch_lat),
             size = 1, alpha = 0.2, color = 'seagreen') +
  geom_point(data = data_spatial_fraud, aes(x = merch_long, y = merch_lat),
             size = 1, alpha = 1, color = 'red')
# Identifying proportions of fraudulent merchant categories
# — from PREDICTED fraud: share of flagged transactions and total
# dollar amount per merchant category.
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Renamed from `cat`, which shadowed base::cat().
pred_counts <- table(data_spatial_fraud_pred$category)
cat2 <- data.table(round(prop.table(pred_counts) * 100, 2))
# Per-category dollar totals. table() and tapply() both order their
# results by factor level, so the positional assignment below lines up;
# categories with no flagged transactions get NA sums.
tap <- as.data.frame.table(
  tapply(data_spatial_fraud_pred$amt, data_spatial_fraud_pred$category, sum)
)
#data.table(tap)
cat2$sum <- tap$Freq
colnames(cat2) <- c('category', 'proportions', 'Sum of Purchases ($)')
cat2 <- cat2 %>% arrange(desc(proportions))
colnames(cat2) <- c('Merchant', 'Proportions (%)', 'Sum of Purchases ($)')
cat2
## Merchant Proportions (%) Sum of Purchases ($)
## 1: misc_net 18.18 13463.53
## 2: misc_pos 17.48 10992.80
## 3: shopping_pos 17.48 25191.93
## 4: shopping_net 10.49 10125.14
## 5: home 9.09 2810.56
## 6: personal_care 6.29 1870.20
## 7: entertainment 4.90 1436.96
## 8: food_dining 4.90 2117.86
## 9: kids_pets 4.90 1746.85
## 10: health_fitness 3.50 1136.30
## 11: grocery_pos 1.40 701.42
## 12: travel 1.40 1593.92
## 13: gas_transport 0.00 NA
## 14: grocery_net 0.00 NA
#from actual
# Same category breakdown, computed from ACTUAL fraud for comparison
# with the predicted table above.
library(data.table)
# Renamed from `cat` to avoid shadowing base::cat().
actual_counts <- table(data_spatial_fraud$category)
cat2 <- data.table(round(prop.table(actual_counts) * 100, 2))
# table() and tapply() share factor-level ordering, so rows align;
# categories with no fraud get NA sums.
tap <- as.data.frame.table(
  tapply(data_spatial_fraud$amt, data_spatial_fraud$category, sum)
)
#data.table(tap)
cat2$sum <- tap$Freq
colnames(cat2) <- c('category', 'proportions', 'Sum of Purchases ($)')
cat2 <- cat2 %>% arrange(desc(proportions))
colnames(cat2) <- c('Merchant', 'Proportions (%)', 'Sum of Purchases ($)')
cat2
## Merchant Proportions (%) Sum of Purchases ($)
## 1: food_dining 20.00 352.20
## 2: personal_care 20.00 58.26
## 3: grocery_pos 13.33 701.42
## 4: kids_pets 13.33 40.16
## 5: shopping_pos 13.33 2135.94
## 6: gas_transport 6.67 12.54
## 7: home 6.67 258.37
## 8: shopping_net 6.67 1004.38
## 9: entertainment 0.00 NA
## 10: grocery_net 0.00 NA
## 11: health_fitness 0.00 NA
## 12: misc_net 0.00 NA
## 13: misc_pos 0.00 NA
## 14: travel 0.00 NA